Useful resources:
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
# Load the example "tips" dataset bundled with seaborn and preview it.
tips = sns.load_dataset('tips')  # one of sns built-in datasets
tips.head()

# Theme configuration. sns.set() re-applies seaborn's default style, context
# and palette on every call, so it must run *before* the more specific
# set_style()/set_context() calls; otherwise it silently resets them.
sns.set(rc={"figure.dpi": 100, "savefig.dpi": 100})  # https://blakeaw.github.io/2020-05-25-improve-matplotlib-notebook-inline-res/
sns.set_style("darkgrid")  # nice, ggplot2-like :)
sns.set_context('notebook')

# Alternative kept for reference: change the default inline image format to a vector format.
# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('svg') # https://blakeaw.github.io/2020-05-25-improve-matplotlib-notebook-inline-res/
# Histograms of the bill amounts; kde=False removes the KDE (kernel density
# estimation) overlay, and `bins` controls the number of bars.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot; kept because these notes use the older API throughout.
sns.distplot(tips['total_bill'], kde=False)
sns.distplot(tips['total_bill'], kde=False, bins=30)

# Joint distribution of bill vs. tip, first with the default kind and then as hexbins.
sns.jointplot(x='total_bill', y='tip', data=tips)  # `kind = 'scatter'`
sns.jointplot(x='total_bill', y='tip', data=tips, kind='hex')

# Pearson correlation to display on the joint axes. JointGrid.annotate() was
# deprecated and then removed from seaborn, so the statistic is computed
# explicitly and written with matplotlib's Axes.annotate(), which works on
# both old and new seaborn versions.
pearson_r, pearson_p = stats.pearsonr(tips['total_bill'], tips['tip'])  # import scipy.stats as stats
pearson_label = f"pearsonr = {pearson_r:.2g}; p = {pearson_p:.2g}"

pl = sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg', color='royalblue')
pl.ax_joint.annotate(pearson_label, xy=(0.05, 0.95), xycoords='axes fraction', va='top')
pl = sns.jointplot(x='total_bill', y='tip', data=tips, kind='kde')
pl.ax_joint.annotate(pearson_label, xy=(0.05, 0.95), xycoords='axes fraction', va='top')

# Pairwise relationships between the columns, coloured by sex.
sns.pairplot(tips, hue='sex', palette='coolwarm')

# Rug plot of the bill amounts, followed by the plain histogram for comparison.
sns.rugplot(tips['total_bill'])
sns.distplot(tips['total_bill'], kde=False)
A KDE plot replaces every single observation with a Gaussian (Normal) distribution centered on that value. The final curve is then obtained by summing up these distributions centered on each data point.
Great YouTube link for understanding how it works.
Kernel Density Estimation is estimating the probability density function.
The area under the curve is 1, and the probability of a value being between x1 and x2 is the area under the curve between those two points.
# Filled KDE of the bill amounts, with a rug plot of the same column.
sns.kdeplot(tips['total_bill'], shade=True)
sns.rugplot(tips['total_bill'])

# The same KDE drawn once per `bw` value, each curve labelled with the value used.
for bw_value in (20, 10, 5, 1):
    sns.kdeplot(tips['total_bill'], bw=bw_value, label=f'bw = {bw_value}')
# Understanding KDE plots step-by-step
dataset = np.random.randn(25)  # Create dataset: 25 random draws
dataset

# Set up the x-axis for the plot, padded by 2 on either side of the data range
x_min = dataset.min() - 2
x_max = dataset.max() + 2

# 100 equally spaced points from x_min to x_max
x_axis = np.linspace(x_min, x_max, 100)

# Set up the bandwidth, using the Silverman method:
bandwidth = ((4 * dataset.std() ** 5) / (3 * len(dataset))) ** .2

# Create an empty kernel list
kernel_list = []

# Plot each basis function
for data_point in dataset:
    # One kernel per observation: a normal distribution centred on the data
    # point with standard deviation `bandwidth`, whose probability density
    # function (pdf) is evaluated at every position of x_axis.
    kernel = stats.norm(data_point, bandwidth).pdf(x_axis)
    kernel_list.append(kernel)

    # Scale for plotting only (peak height becomes 0.4). The division builds a
    # new array, so the unscaled kernel stored in kernel_list is untouched.
    kernel = kernel / kernel.max()
    kernel = kernel * .4
    plt.plot(x_axis, kernel, color='grey', alpha=0.5)

# Set the y-axis range once, after all kernels have been drawn
plt.ylim(0, 1)
I should check these two links:
https://www.youtube.com/watch?v=uial-2girHQ&ab_channel=DataDaft
# The KDE curve is the sum of the basis functions: stack the kernels
# row-wise and add them up along axis 0 (one total per x_axis position).
sum_of_kde = np.array(kernel_list).sum(axis=0)

# Draw the summed curve
fig = plt.plot(x_axis, sum_of_kde, color='indianred')

# Overlay the initial rug plot of the raw data points
sns.rugplot(dataset, c='indianred')

# Title the figure and hide the y-tick marks
plt.suptitle("Sum of the Basis Functions")
plt.yticks([])
This is based on the aforementioned YouTube video.
# Load the "mpg" cars dataset and drop every row that has a missing value.
cars = sns.load_dataset('mpg').dropna()
cars.info()  # also useful: `cars.shape`
cars.head()

# Horsepower vs. fuel economy. The columns are passed by keyword (x=/y= plus
# data=) rather than positionally: newer seaborn releases reject positional
# x/y, and the keyword form also works on older releases.
sns.scatterplot(x='horsepower', y='mpg', data=cars)
# Bivariate KDE plots of horsepower vs. mpg, adding one option at a time.
# NOTE(review): this is the older kdeplot calling convention (two positional
# Series, `shade`, `shade_lowest`, `n_levels`). Newer seaborn releases expect
# x=/y= keywords and have renamed these options — confirm the seaborn version
# before running.

# Contours drawn with alpha = 0.5.
sns.kdeplot(cars['horsepower'], cars['mpg'], alpha = 0.5)
# Same data with n_levels = 20.
sns.kdeplot(cars['horsepower'], cars['mpg'], n_levels = 20)
# Switch to the "whitegrid" style and add the 'Blues' colormap with shading.
sns.set_style("whitegrid")
sns.kdeplot(cars['horsepower'], cars['mpg'],
n_levels = 20,
cmap= 'Blues',
shade = True)
# Same call with shade_lowest = False
# (presumably leaves the lowest contour level unshaded — confirm in the docs).
sns.set_style("whitegrid")
sns.kdeplot(cars['horsepower'], cars['mpg'],
n_levels = 20,
cmap= 'Blues',
shade = True,
shade_lowest = False)
# Same call again with cbar = True to request a colorbar.
sns.kdeplot(cars['horsepower'], cars['mpg'],
n_levels = 20,
cmap = 'Blues',
shade = True,
shade_lowest = False,
cbar = True)
# Split the cars into the 4-cylinder and 8-cylinder subsets.
cyl_4 = cars[cars.cylinders == 4]
cyl_8 = cars[cars.cylinders == 8]

plt.figure(figsize=(8, 6))

# One shaded bivariate KDE per subset, each with its own colormap.
for group, group_cmap in ((cyl_4, "Blues"), (cyl_8, "Reds")):
    sns.kdeplot(group.horsepower, group.mpg,
                cmap=group_cmap, shade=True, shade_lowest=False)

plt.xlabel('Horsepower', fontsize=14)
plt.ylabel('Miles per Gallon (MPG)', fontsize=14)

# plt.annotate(): (s: str, xy: Tuple[float, float], *args: Any, **kwargs: Any)
# In order to understand this function better, I wrote some code in the next cell
label_style = {'fontsize': 16, 'fontweight': 'bold'}
plt.annotate("4 Cylinders", (105, 32), color='blue', **label_style)
plt.annotate("8 Cylinders", (190, 18), color='red', **label_style);
plt.figure(figsize=(8, 6))

# (label text, xy position) pairs to place on the empty figure.
demo_points = [
    ('0, 0', (0, 0)),
    ('0.2 , 0.2', (0.2, 0.2)),
    ('0.5 , 0.5', (0.5, 0.5)),
    ('0.2 , 0.5', (0.2, 0.5)),
    ('0.5, 0.2', (0.5, 0.2)),
    ('1, 1', (1, 1)),
]
for text, position in demo_points:
    plt.annotate(text, position, color='green', fontsize=10)
# Since this plot ranges from 0 to 1 on both the X and the Y axis, we locate annotations by
# referring to coordinates that lie between 0 and 1
tips.head()
sns.set_style('darkgrid')

# Bar plot of total_bill per sex; by default, we're looking at the mean
# total_bill per gender. Columns are named via x=/y=/data= (as in the next
# call) because newer seaborn releases reject positional x/y arguments.
sns.barplot(x='sex', y='total_bill', data=tips)

# Same bars, aggregated with the standard deviation instead of the default.
sns.barplot(x='sex', y='total_bill', data=tips, estimator=np.std)

# Count of rows per sex.
sns.countplot(x='sex', data=tips)
A boxplot is a standardized way of displaying the dataset based on the five-number summary: the minimum, the maximum, the sample median, and the first and third quartiles.
# Box plots of total_bill per day. x/y are passed by keyword: newer seaborn
# releases reject positional x/y, and the keyword form works on old ones too.
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)  # `hue` is, of course, optional
sns.boxplot(x='day', y='total_bill', data=tips)

# Summary statistics of Thursday's bills, used to work out the whisker limit by hand.
inf = tips[tips['day'] == 'Thur']['total_bill'].describe()
inf

# Interquartile range: third quartile minus first quartile.
IQR = inf['75%'] - inf['25%']
# useful guide for printing: https://www.delftstack.com/howto/python/python-print-string-and-variable/
print(f"IQR is: {IQR}\n1.5 * IQR = {1.5 * IQR}\n3Q + (1.5 * IQR) = {inf['75%'] + (1.5 * IQR)}")
In this case, the maximum value in this data set is $43.1.
1.5 IQR above the third quartile is $31.72.
The maximum is greater than 1.5 IQR plus the third quartile, so the maximum is an outlier.
Therefore, the upper whisker is drawn at the greatest value smaller than 1.5 IQR above the third quartile.
# Box vs. violin plots of total_bill per day, split by smoker. x/y are passed
# by keyword (matching the strip/swarm calls below): newer seaborn releases
# reject positional x/y, and the keyword form works on older ones too.
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)
sns.violinplot(x='day', y='total_bill', hue='smoker', data=tips)
sns.violinplot(x='day', y='total_bill', data=tips, hue='smoker', split=True)
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)
sns.violinplot(x='day', y='total_bill', hue='smoker', data=tips)

# Strip plots: one marker per observation.
sns.stripplot(x='day', y='total_bill', data=tips)  # default: `jitter = True`
sns.stripplot(x='day', y='total_bill', data=tips, jitter=False)
sns.stripplot(x='day', y='total_bill', data=tips, hue='sex', dodge=True)
# The `split` parameter has been renamed to `dodge`.

# Swarm plot on its own, then a violin plot with a black swarm plot drawn after it.
sns.swarmplot(x='day', y='total_bill', data=tips)
sns.violinplot(x='day', y='total_bill', data=tips)
sns.swarmplot(x='day', y='total_bill', data=tips, color='black')
catplot - the most general method for plotting categorical data
sns.catplot(x = 'day', y = 'total_bill', data = tips, kind = 'bar')
# tips = sns.load_dataset('tips')
flights = sns.load_dataset('flights')
tips.head()
flights.head()

# Correlation matrix of the numeric columns only. Selecting them explicitly
# keeps this working on pandas versions where DataFrame.corr() no longer
# skips non-numeric (e.g. categorical) columns and raises instead.
tc = tips.select_dtypes(include=['number', 'bool']).corr()
tc
sns.heatmap(tc, cmap='BuPu', annot=True)  # https://python-graph-gallery.com/92-control-color-in-seaborn-heatmaps

# Reshape the flights table to month (rows) x year (columns) with passenger counts as values.
flights
fp = flights.pivot(index='month', columns='year', values='passengers')
fp

cmap = sns.cm.rocket_r  # reversing the color scheme
sns.heatmap(fp, cmap=cmap)  # https://stackoverflow.com/questions/47461506/how-to-invert-color-of-seaborn-heatmap-colorbar

# linewidths is a numeric width, so pass a float rather than the string '0.3'.
sns.heatmap(fp, cmap='magma', linecolor='white', linewidths=0.3)

# Cluster maps of the same table, the second one with standard_scale = 1.
sns.clustermap(fp, cmap='coolwarm')
sns.clustermap(fp, cmap='coolwarm', standard_scale=1)
# Load the iris dataset and look at its first rows and the distinct species.
iris = sns.load_dataset('iris')
iris.head()
iris['species'].unique()

# Ready-made pair plots: plain, then coloured by species.
sns.pairplot(iris)
sns.pairplot(iris, hue='species', palette='hls')  # https://seaborn.pydata.org/tutorial/color_palettes.html

# PairGrid with the same plotting function mapped onto every cell.
g = sns.PairGrid(iris).map(sns.scatterplot)

# PairGrid with a different plotting function for the diagonal, the upper
# triangle and the lower triangle.
g = (
    sns.PairGrid(iris)
    .map_diag(sns.distplot)
    .map_upper(sns.scatterplot)
    .map_lower(sns.kdeplot)
)
sns.FacetGrid
tips.head()
# Both grids share the same layout: one column per `time`, one row per `smoker`.
facet_layout = {'data': tips, 'col': 'time', 'row': 'smoker'}

# total_bill distribution in every facet.
g = sns.FacetGrid(**facet_layout)
g.map(sns.distplot, 'total_bill')

# total_bill vs. tip scatter in every facet, plus a legend.
g = sns.FacetGrid(**facet_layout)
g.map(plt.scatter, 'total_bill', 'tip').add_legend()
sns.JointGrid
g = sns.JointGrid(x= 'total_bill', y = 'tip', data = tips)
# JointGrid with regplot on the joint axes and distplot on the marginal axes.
g = sns.JointGrid(x='total_bill', y='tip', data=tips)
g = g.plot(sns.regplot, sns.distplot)

tips.head()

# Every lmplot below regresses tip on total_bill from the tips data.
lm_base = {'x': 'total_bill', 'y': 'tip', 'data': tips}

sns.lmplot(**lm_base)
sns.lmplot(hue='sex', **lm_base)
sns.lmplot(hue='sex', markers=['o', 'v'], **lm_base)  # matplotlib
sns.lmplot(hue='sex', markers=['o', 'v'],
           scatter_kws={'s': 100}, **lm_base)  # direct call to matplotlib

# Faceted by day (columns) and smoker (rows), coloured by sex.
sns.set_context('paper', font_scale=2)
sns.lmplot(col='day', row='smoker',
           hue='sex', palette='coolwarm', **lm_base)

# Faceted by sex (columns) and time (rows) with explicit facet sizing.
sns.set_context('paper', font_scale=2)
sns.lmplot(col='sex', row='time', aspect=3, height=3, **lm_base)  # ratio between w & h
# Styling experiments: each change is followed by a plot so its effect can be seen.
sns.set_style('ticks') # None, or one of {darkgrid, whitegrid, dark, white, ticks}
sns.countplot(x = 'sex', data = tips)
sns.despine(left = True, bottom = True) # top, right, left, bottom : boolean, optional
# A 12 x 3 figure created through matplotlib before the next countplot call.
plt.figure(figsize = (12, 3)) # Matplotlib works in combination with Seaborn
sns.countplot(x = 'sex', data = tips)
# help(sns.set_context)
sns.set_context('poster', font_scale = 3) # None, or one of {paper, notebook, talk, poster} # `font_scale = 3` (3 times the default)
sns.countplot(x = 'sex', data = tips)
# Back to the 'notebook' context with the font scale set to 1 again.
sns.set_context('notebook', font_scale = 1) # None, or one of {paper, notebook, talk, poster} # `font_scale = 1` (the default scale)
sns.countplot(x = 'sex', data = tips)
# Regression plot coloured by sex using the 'seismic' palette.
sns.lmplot(x = 'total_bill', y = 'tip', data = tips, hue = 'sex', palette = 'seismic')
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Theme configuration. sns.set() re-applies seaborn's default style and
# context on every call, so it runs first and the explicit
# set_style()/set_context() choices are applied on top of it.
sns.set(rc={"figure.dpi": 100, "savefig.dpi": 100})
sns.set_style('darkgrid')
sns.set_context('notebook', font_scale=1)

titanic = sns.load_dataset('titanic')
titanic.head()

# Joint plot of fare vs. age with the Pearson correlation written on it.
pl = sns.jointplot(x='fare', y='age', data=titanic)
import scipy.stats as stats
# JointGrid.annotate() was deprecated and then removed from seaborn, so the
# statistic is computed explicitly. Rows with a missing fare or age are
# dropped first, because pearsonr would otherwise return NaN.
fare_age = titanic[['fare', 'age']].dropna()
pearson_r, pearson_p = stats.pearsonr(fare_age['fare'], fare_age['age'])
pl.ax_joint.annotate(f"pearsonr = {pearson_r:.2g}; p = {pearson_p:.2g}",
                     xy=(0.05, 0.95), xycoords='axes fraction', va='top')

# Fare histogram (no KDE overlay).
sns.distplot(titanic['fare'], kde=False)

# Age by passenger class: box plot, then swarm plot.
sns.boxplot(x='class', y='age', data=titanic)
sns.swarmplot(x='class', y='age', data=titanic)
titanic.columns

# Passenger count per sex.
sns.countplot(x='sex', data=titanic)

# Correlation heatmap. Only numeric and boolean columns are kept, so this also
# works on pandas versions where DataFrame.corr() raises on text/categorical
# columns instead of skipping them.
tc = titanic.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(tc, cmap='coolwarm')

# Age histograms faceted by sex.
fg = sns.FacetGrid(data=titanic, col='sex', hue='sex')
fg.map(sns.distplot, 'age', kde=False, bins=10)
fg = sns.FacetGrid(data=titanic, col='sex')  # This is Jose's solution
fg.map(plt.hist, 'age')
jupyter nbconvert file.ipynb --to html_toc